The data on constraint was obtained from: Christmas, M. J., Kaplow, I. M., Genereux, D. P., Dong, M. X., Hughes, G. M., Li, X., Sullivan, P. F., Hindle, A. G., Andrews, G., Armstrong, J. C., Bianchi, M., Breit, A. M., Diekhans, M., Fanter, C., Foley, N. M., Goodman, D. B., Goodman, L., Keough, K. C., Kirilenko, B., … Karlsson, E. K. (2023). Evolutionary constraint and innovation across hundreds of placental mammals. Science, 380(6643), eabn3943.
set the working directory, open the data and load packages.
setwd("C:/Users/kcwol/Downloads/mammalperspective_viperdata")
df <- read.csv("C:/Users/kcwol/Downloads/mammalperspective_viperdata/mammals_stats6_nodfa.csv")
library(plotly)
library(webshot)
library(ggplot2)
library(dplyr)
library(orca)
library(bestNormalize)
library(effectsize)
library(emmeans)
library(dunn.test)
library(broom)
library(rstatix)
library(scico)
library(sjstats)
library(kableExtra)
library(EnvStats)
library(Hmisc)
library(smotefamily)
library(MASS)
library(e1071)
library(caTools)
library(nnet)
library(caret)
library(viridis)
library(scales)
Check for outliers in the response variable mean Phylop
Figure. Boxplot of mean Phylop
Determine whether there are any outliers in the response variable
test <- rosnerTest(df$meanPhylop,
k = 10)
test
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: df$meanPhylop
##
## Number NA/NaN/Inf's Removed: 1173
##
## Sample Size: 15784
##
## Test Statistics: R.1 = 4.010473
## R.2 = 3.884635
## R.3 = 3.836776
## R.4 = 3.699596
## R.5 = 3.683630
## R.6 = 3.651400
## R.7 = 3.582222
## R.8 = 3.545119
## R.9 = 3.539957
## R.10 = 3.526327
##
## Test Statistic Parameter: k = 10
##
## Alternative Hypothesis: Up to 10 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 0
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 3.500269 1.438392 -2.268365 2678 4.010473 4.658103 FALSE
## 2 1 3.500634 1.437705 -2.084323 418 3.884635 4.658090 FALSE
## 3 2 3.500988 1.437063 -2.012699 9721 3.836776 4.658076 FALSE
## 4 3 3.501338 1.436438 -1.812902 2571 3.699596 4.658063 FALSE
## 5 4 3.501675 1.435860 -1.787503 5622 3.683630 4.658050 FALSE
## 6 5 3.502010 1.435288 -1.738800 1589 3.651400 4.658037 FALSE
## 7 6 3.502342 1.434727 -1.637168 14664 3.582222 4.658024 FALSE
## 8 7 3.502668 1.434189 -1.581702 1115 3.545119 4.658011 FALSE
## 9 8 3.502990 1.433663 -1.572114 714 3.539957 4.657998 FALSE
## 10 9 3.503312 1.433138 -1.550403 1953 3.526327 4.657985 FALSE
There are no outliers. Make a histogram for mean PhyloP:
plot <- ggplot(df, aes(x = meanPhylop)) +
geom_histogram(binwidth = 0.1, fill = "#002051", color = "black") +
labs(title = "Distribution of median of the Mean PhyloP Values") + theme_minimal()
# Save the histogram as a PDF
ggsave("Hist_meanphylop._untransformed.pdf", width = 10, height = 10)
## Warning: Removed 1173 rows containing non-finite values (`stat_bin()`).
Determine best method for normal transformation of response variable
df$meanPhylop <- as.numeric(df$meanPhylop)
(bn_meanPhylop <- bestNormalize(df$meanPhylop))
## Warning: `progress_estimated()` was deprecated in dplyr 1.0.0.
## ℹ The deprecated feature was likely used in the bestNormalize package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Best Normalizing transformation with 15784 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 28.3016
## - Center+scale: 7.4771
## - Double Reversed Log_b(x+a): 1.6494
## - Exp(x): 45.2308
## - Log_b(x+a): 18.3097
## - orderNorm (ORQ): 1.1137
## - sqrt(x + a): 11.7974
## - Yeo-Johnson: 3.2102
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 15784 nonmissing obs and ties
## - 15779 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## -2.268 2.669 3.761 4.580 6.762
Transform the response variable with orderNorm transformation. This snippet may be repeated below as datasets are split, merged and otherwise transformed.
BN_meanPhylop <- orderNorm(df$meanPhylop)
orderNorm_meanPhylop <- predict(BN_meanPhylop)
df <- cbind(df, orderNorm_meanPhylop)
Make a histogram for order Norm transformed dependent variable “mean PhyloP”
ggplot(df, aes(x = orderNorm_meanPhylop)) +
geom_histogram(binwidth = 0.1, fill = "#fdea45", color = "black") +
labs(title = "Distribution of median orderNorm_Mean PhyloP Values") + theme_minimal()
ggsave("Hist_meanphylop_oNtransformed.pdf", width = 10, height = 10)
We use Average Shortest Path Length (ASPL), BetweeennessCentrality (BC), and Neighborhood Connectivity (NC). These values have been extracted from the human interactome using the network analysis function in Cytoscape
Create 3D scatterplot for ASPL, BC, and NC with R (plotly library) to see variation in order Norm transformed mean Phylop
df$orderNorm_meanPhylop <- as.numeric(df$orderNorm_meanPhylop)
NWstats_meanphylop_3Dscatter <- plot_ly(df, type = "scatter3d", mode = "markers",
x = ~AverageShortestPathLength, y = ~BetweennessCentrality, z = ~NeighborhoodConnectivity,
marker = list(size = 8, opacity = 0.5, color = ~orderNorm_meanPhylop, colorscale = 'Cividis',colorbar = list(title = "oN mean PhyloP", tickmode = "auto", ticks = "outside", ticklen = 10), text = ~paste("oN_meanPhylop: ", orderNorm_meanPhylop))) %>%
layout(scene = list(xaxis = list(type = "log", title = "log_ASPL"),
yaxis = list(type = "log", title = "log_BC"),
zaxis = list(type = "log", title = "log_NC")))
NWstats_meanphylop_3Dscatter
Reduce the dimensionality of the data to better visualize the trend of constraint values changing across the network (to n=300 points), by slicing it into equidistant cubes and calculating mean values of (oN_mean_Phylop) per each cube
#make dataframe copy
original_data <- df
num_final_points <- 300
num_cubes <- round(num_final_points^(1/3))
x_breaks <- quantile(original_data$AverageShortestPathLength, probs = seq(0, 1, length.out = num_cubes + 1), na.rm = TRUE)
y_breaks <- quantile(original_data$BetweennessCentrality, probs = seq(0, 1, length.out = num_cubes + 1), na.rm = TRUE)
z_breaks <- quantile(original_data$NeighborhoodConnectivity, probs = seq(0, 1, length.out = num_cubes + 1), na.rm = TRUE)
original_data$Cube <- cut(original_data$AverageShortestPathLength, breaks = x_breaks, labels = FALSE) +
num_cubes * cut(original_data$BetweennessCentrality, breaks = y_breaks, labels = FALSE) +
num_cubes^2 * cut(original_data$NeighborhoodConnectivity, breaks = z_breaks, labels = FALSE)
Create a 3D scatter plot of the cube identities for all points
cube_figure <- plot_ly(original_data, type = "scatter3d", mode = "markers",
x = ~AverageShortestPathLength, y = ~BetweennessCentrality, z = ~NeighborhoodConnectivity,
marker = list(size = 8, opacity = 0.5, color = ~Cube, colorscale = 'Viridis', colorbar = list(title = "oN mean PhyloP", tickmode = "auto", ticks = "outside", ticklen = 10), text = ~paste("Cube_number: ", Cube))) %>%
layout(scene = list(xaxis = list(type = "log", title = "log ASPL"),
yaxis = list(type = "log", title = "log BC"),
zaxis = list(type = "log", title = "log NC")))
cube_figure
Bin each cube’s datapoints into one average point using means of coordinates
mean_cube_coordinates <- aggregate(cbind(AverageShortestPathLength, BetweennessCentrality, NeighborhoodConnectivity) ~ Cube, data = original_data, mean, na.rm = TRUE)
binned_cube_figure <- plot_ly(mean_cube_coordinates, type = "scatter3d", mode = "markers",
x = ~AverageShortestPathLength, y = ~BetweennessCentrality, z = ~NeighborhoodConnectivity,
marker = list(size = 8, opacity = 0.5, color = ~Cube, colorscale = 'Viridis', colorbar = list(title = "Cube bins", tickmode = "auto", ticks = "outside", ticklen = 10), text = ~paste("Cube: ", Cube))) %>%
layout(scene = list(xaxis = list(type = "log", title = "log ASPL"),
yaxis = list(type = "log", title = "log BC"),
zaxis = list(type = "log", title = "log NC")))
binned_cube_figure
calculate the average order Norm transformed mean Phylop for each cube
mean_phylop_coordinates_per_cube_mean <- aggregate(cbind(AverageShortestPathLength, BetweennessCentrality, NeighborhoodConnectivity, meanPhylop, orderNorm_meanPhylop) ~ Cube, data = original_data, mean, na.rm = TRUE)
#save this new datasheet to csv
write.csv(original_data,"mammals_stats_cubed2.csv", row.names=FALSE)
also flip the axis so that smallest ASPL meets largest BC to better represent the source network as: outside (large ASPL, small BC) vs. inside (small ASPL, large BC). Filter out one outlier in BC and plot log values
#subset dataframe
filtered_data <- mean_phylop_coordinates_per_cube_mean %>%
filter(BetweennessCentrality < 0.002)
color_scale_min <- quantile(filtered_data$orderNorm_meanPhylop, probs = 0.1)
color_scale_max <- quantile(filtered_data$orderNorm_meanPhylop, probs = 0.8)
Binned_NWstats_meanphylop_3Dscatter <- plot_ly(filtered_data, type = "scatter3d", mode = "markers",
x = ~AverageShortestPathLength, y = ~BetweennessCentrality, z = ~NeighborhoodConnectivity,
marker = list(size = 8, opacity = 0.5,
color = ~orderNorm_meanPhylop,
colorscale='Cividis',
cmin = color_scale_min,
cmax = color_scale_max,
colorbar = list(title = "oN Mean PhyloP", tickmode = "auto", ticks = "outside", ticklen = 10))) %>%
layout(scene = list(xaxis = list(type = "linear", title = "ASPL", autorange="reversed"),
yaxis = list(type = "log", title = "BC", autorange="reversed"),
zaxis = list(type = "linear", title = "NC")))
Binned_NWstats_meanphylop_3Dscatter
Admire how the points get lighter/more yellow from the outside towards the center…
Make Histograms for NC, BC, and ASPL
ggplot(original_data, aes(x = AverageShortestPathLength)) +
geom_histogram(binwidth = 0.1, fill = "#002051", color = "black") +
labs(title = "Distribution of ASPL")
ggplot(original_data, aes(x = NeighborhoodConnectivity)) +
geom_histogram(binwidth = 1, fill = "azure4", color = "black") +
labs(title = "Distribution of NC")
ggplot(original_data, aes(x = BetweennessCentrality)) +
geom_histogram(binwidth = 0.001, fill = "#fdea45", color = "black") +
labs(title = "Distribution of BC")
These predictors are on totally different scales which may influence some effect size calculations later. So they will first be scaled.
#make new dataframe
original_data2 <- original_data
original_data2$AverageShortestPathLength <- scale(original_data2$AverageShortestPathLength)
original_data2$NeighborhoodConnectivity <- scale(original_data2$NeighborhoodConnectivity)
original_data2$BetweennessCentrality <- scale(original_data2$BetweennessCentrality)
#save as new datasheet
write.csv(original_data2, "mammals_stats_scaled.csv", row.names=FALSE)
Test model assumptions and fit a linear model lm for the effect of ASPL, BC and NC on orderNorm transformed mean Phylop
model <- lm(orderNorm_meanPhylop ~ AverageShortestPathLength + NeighborhoodConnectivity + BetweennessCentrality, data = original_data2)
model.metrics <- augment(model)
model.metrics <- model.metrics[, !(names(model.metrics) %in% c(".hat", ".sigma", ".fitted"))]
head(model.metrics, 3)
## # A tibble: 3 × 8
## .rownames orderNorm_meanPhylop AverageShortestPathLen…¹ NeighborhoodConnecti…²
## <chr> <dbl> <dbl> <dbl>
## 1 1 0.198 -1.50 0.0970
## 2 2 -1.36 -2.02 0.517
## 3 3 -1.03 -1.78 0.337
## # ℹ abbreviated names: ¹AverageShortestPathLength[,1],
## # ²NeighborhoodConnectivity[,1]
## # ℹ 4 more variables: BetweennessCentrality <dbl[,1]>, .resid <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>
Run a Shapiro test for deviations of residuals from Normality. The max rows for this are exceeded with the full dataset and to prevent p-value inflation, here only a subset of random rows is tested.
sampled_resid <- sample(model.metrics$.resid, 200)
shapiro_test(sampled_resid)
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 sampled_resid 0.994 0.564
The residuals don’t deviate from Normality Make a Q-Q plot for visually checking homogeneity of variances
qqnorm(sampled_resid)
qqline(sampled_resid)
Display the summary of the model
summary(model)
##
## Call:
## lm(formula = orderNorm_meanPhylop ~ AverageShortestPathLength +
## NeighborhoodConnectivity + BetweennessCentrality, data = original_data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0997 -0.6415 -0.0084 0.6227 4.5214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.005948 0.007643 -0.778 0.4364
## AverageShortestPathLength -0.326063 0.009263 -35.202 <2e-16 ***
## NeighborhoodConnectivity -0.167005 0.008847 -18.877 <2e-16 ***
## BetweennessCentrality 0.018402 0.007599 2.422 0.0155 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9599 on 15780 degrees of freedom
## (1173 observations deleted due to missingness)
## Multiple R-squared: 0.07874, Adjusted R-squared: 0.07857
## F-statistic: 449.6 on 3 and 15780 DF, p-value: < 2.2e-16
To estimate overall effect size, calculate Cohen’s f squared for the entire model which is ratio of the explained variance to the residual variance
R_squared <- summary(model)$r.squared
f_squared <- R_squared / (1 - R_squared)
f_squared
## [1] 0.08547111
Calculate effect sizes for each predictor as Partial Eta-squared (η²), representing the proportion of variance in the dependent variable that is associated with a particular predictor, while controlling for the effects of other predictors in the model.
eta_squared <- eta_squared(model)
eta_squared
## AverageShortestPathLength NeighborhoodConnectivity BetweennessCentrality
## 0.0571201025 0.0212785302 0.0003423979
To visualise node position within the network better, nodes are categorized based on ASPL, BC and NC. Categories are Hub, Peripheral and Intermediate H, P and I.
Problem: The distributions of ASPL, BC and NC show skew, but most extreme values are not represented by large numbers of datapoints. For example, Hub nodes are characterised by large Betweeness Centrality, but the number of true hub nodes in the dataset is expected to be proportionally rather small. Therefore, methods to increase training data or to equally represent categories from the actual dataset such as removal of outliers or stratification, may lead to wrong classification results. We can implement a workaround called “synthetic data generation” or SMOTE (Synthetic Minoriry oversampling technique) which is a Machine Learning algorithm. Here, lines are drawn between two datapoints in the original dataset and new synthetic datapoints are generated along that line. This way, the areas of the distribution with low dots are “filled up”. The k parameter ensures this is only done between close neighbors to avoid deviating too much from the original distribution when doing so.
First we perform an initial classification, where nodes with extreme values of ASPL, BC and NC get binned into H, I and P categories. Note that less BC values are taken due to its extremely skewed distribution.
#open new dataset here just as a starting point to be able to repeat this a few times without having to delete out any columns. Otherwise commenting this out and continuing with the main analysis dataset is fine.
dataset <- read.csv('mammals_stats7_nodfa.csv')
dataset$id <- seq_len(nrow(dataset))
dataset$DFA_Nodecat <- NA
#using log BC here because of its underlying distribution causing issues with classification based on percentiles
dataset$log_BetweennessCentrality <- log(dataset$BetweennessCentrality + 1) # +1 to avoid log(0)
threshold_H <- quantile(dataset$log_BetweennessCentrality, probs = 0.99, na.rm = TRUE)
threshold_I <- quantile(dataset$NeighborhoodConnectivity, probs = 0.9, na.rm = TRUE)
threshold_P <- quantile(dataset$AverageShortestPathLength, probs = 0.9, na.rm = TRUE)
dataset$DFA_Nodecat <- ifelse(dataset$log_BetweennessCentrality > threshold_H, 'H',
ifelse(dataset$NeighborhoodConnectivity > threshold_I, 'I',
ifelse(dataset$AverageShortestPathLength > threshold_P, 'P', NA)))
dataset$DFA_Nodecat <- as.factor(dataset$DFA_Nodecat)
print("table of initial node category counts")
## [1] "table of initial node category counts"
table(dataset$DFA_Nodecat)
##
## H I P
## 170 1694 1696
train_set <- dataset[!is.na(dataset$DFA_Nodecat), ]
test_set <- dataset[is.na(dataset$DFA_Nodecat), ]
print("row count of train_set")
## [1] "row count of train_set"
print(nrow(train_set))
## [1] 3560
print("row count of test_set")
## [1] "row count of test_set"
print(nrow(test_set))
## [1] 13397
# Create a copy of this initially classified data for later use
initial_classified_copy <- train_set[!is.na(train_set$DFA_Nodecat), ]
SMOTE by default is applied to binary class problems - needs to be optimized for multi class problem (such as our H,I,P nodes). The majority class in train_set$DFA_Nodecat is first identified to distinguish it from the minority classes and then a loop iterates over each class level in DFA_Nodecat. For each class (except the majority class), a binary classification dataset is created, where the current class is treated as one class and all other classes are combined into a second class (“Other”). This is done to apply SMOTE separately to each minority class against the rest of the data.Then everything is merged back together to create a balanced training set.
train_set$DFA_Nodecat <- as.factor(train_set$DFA_Nodecat)
majority_class <- names(sort(table(train_set$DFA_Nodecat), decreasing = TRUE))[1]
balanced_train_set <- data.frame()
for (class in levels(train_set$DFA_Nodecat)) {
if (class != majority_class) {
binary_data <- train_set[, c('AverageShortestPathLength','BetweennessCentrality', 'log_BetweennessCentrality', 'NeighborhoodConnectivity')]
binary_target <- as.factor(ifelse(train_set$DFA_Nodecat == class, class, "Other"))
smote_data <- SMOTE(binary_data[, c('AverageShortestPathLength','BetweennessCentrality', 'NeighborhoodConnectivity')],
binary_target,
K = 5,
dup_size = 2)
if (is.list(smote_data) && "data" %in% names(smote_data)) {
combined_data <- smote_data$data
combined_data$DFA_Nodecat <- as.factor(combined_data$class)
balanced_train_set <- rbind(balanced_train_set, combined_data[combined_data$DFA_Nodecat == class, ])
}
}
}
balanced_train_set$class <- NULL
print("row count of balanced_train_set")
## [1] "row count of balanced_train_set"
print(nrow(balanced_train_set))
## [1] 5592
common_columns <- intersect(names(train_set), names(balanced_train_set))
initial_classified_copy2 <- train_set[!is.na(train_set$DFA_Nodecat), common_columns]
majority_class_data <- train_set[train_set$DFA_Nodecat == majority_class, common_columns, drop = FALSE]
balanced_train_set <- rbind(balanced_train_set, majority_class_data)
print("row count of balanced_train_set after binding majority data")
## [1] "row count of balanced_train_set after binding majority data"
print(nrow(balanced_train_set))
## [1] 7288
balanced_train_set <- balanced_train_set[balanced_train_set$DFA_Nodecat != "Other", ]
balanced_train_set$DFA_Nodecat <- droplevels(balanced_train_set$DFA_Nodecat)
balanced_train_set <- rbind(balanced_train_set, initial_classified_copy2)
print("Row count of balanced_train_set after adding initial_classified_copy:")
## [1] "Row count of balanced_train_set after adding initial_classified_copy:"
print(nrow(balanced_train_set))
## [1] 10848
table(balanced_train_set$DFA_Nodecat)
##
## H I P
## 680 6776 3392
print("row count of DFA_Nodecat in balanced_train_set after SMOTE and exclude other")
## [1] "row count of DFA_Nodecat in balanced_train_set after SMOTE and exclude other"
print(nrow(balanced_train_set))
## [1] 10848
Run SVM to assign nodes into categories based on ASPL, NC and BC. A polynomial Kernel is picked to better discriminate nodes based on the three predictors than a linear DFA would.
test_set <- dataset[is.na(dataset$DFA_Nodecat), ] #just make this again, in case some things got messed up along the way
test_set_for_prediction <- test_set[, c('AverageShortestPathLength','BetweennessCentrality', 'BetweennessCentrality','NeighborhoodConnectivity')]
svm_model <- svm(DFA_Nodecat ~ AverageShortestPathLength + BetweennessCentrality + NeighborhoodConnectivity, data = balanced_train_set,
kernel = "polynomial", degree = 2, coef0 = 2, gamma = 20)
svm_predictions <- predict(svm_model, newdata = test_set_for_prediction)
svm_predictions_vector <- as.vector(svm_predictions)
print(head(svm_predictions_vector))
## [1] "H" "I" "I" "I" "I" "I"
# Attach predictions back to the test_set and copy everything back together
test_set$Predicted_Class2 <- NA
test_set$Predicted_Class2 <- svm_predictions_vector
print("row count of test set_predicted class2 still missing the original training data")
## [1] "row count of test set_predicted class2 still missing the original training data"
print(nrow(test_set))
## [1] 13397
initial_classified_copy$Predicted_Class2 <- NULL
initial_classified_copy$Predicted_Class2 <- NA
initial_classified_copy$DFA_Nodecat <- as.factor(initial_classified_copy$DFA_Nodecat)
initial_classified_copy$Predicted_Class2 <- as.factor(initial_classified_copy$Predicted_Class2)
test_set$Predicted_Class2 <- as.factor(test_set$Predicted_Class2)
test_set <- rbind(test_set, initial_classified_copy)
test_set$Predicted_Class2[is.na(test_set$Predicted_Class2)] <- test_set$DFA_Nodecat[is.na(test_set$Predicted_Class2)]
test_set <- unique(test_set)
print("table of node category counts after SVM and adding training data")
## [1] "table of node category counts after SVM and adding training data"
print(nrow(test_set))
## [1] 16957
table(test_set$Predicted_Class2)
##
## H I P
## 348 12057 4552
It is not possible to validate this model since the initial training set was used to absolutely distinguish the categories, so if we would back- predict these with the balanced training set-derived model, the model will artificially look perfect due to it containing a circular argument. We can instead look at the classification decision values (further down), and visually inspect how points were classified based on ASPL, BC and NC.
Make a 3D scatterplot of the classification outcome
plot_ly(data = test_set, x = ~AverageShortestPathLength, y = ~BetweennessCentrality, z = ~NeighborhoodConnectivity,
type = 'scatter3d', mode = 'markers',
marker = list(size = 5, color = ~as.numeric(Predicted_Class2), colorscale = 'Viridis', opacity = 0.8),
text = ~paste('Predicted_Class2:', Predicted_Class2)) %>%
layout(title = '3D Scatterplot of scaled ASPL, BC,NC',
scene = list(xaxis = list(title = 'ASPL'),
yaxis = list(title = 'BC'),
zaxis = list(title = 'NC')))
Visually compare constraint values between node categories through a Means-with-error box plot on both normalized and non-normalized mean Phylop
original_data2 <- test_set
ON_meanPhylop <- orderNorm(original_data2$meanPhylop)
orderNorm_meanPhylop <- predict(ON_meanPhylop)
original_data2 <- cbind(original_data2, orderNorm_meanPhylop)
plot_data <- original_data2
lower_quantile <- quantile(plot_data$orderNorm_meanPhylop, probs = 0.05, na.rm = TRUE)
upper_quantile <- quantile(plot_data$orderNorm_meanPhylop, probs = 0.95, na.rm = TRUE)
assign_combined_color <- function(value, lower_quantile, upper_quantile, lower_color, upper_color, middle_color_scale) {
if (is.na(value)) {
return(NA) # Keep NA if the value is NA
} else if (value <= lower_quantile) {
return(lower_color)
} else if (value >= upper_quantile) {
return(upper_color)
} else {
normalized_value <- (value - lower_quantile) / (upper_quantile - lower_quantile)
return(middle_color_scale(normalized_value))
}
}
middle_color_scale <- scales::col_numeric(palette = viridis_pal(option = "cividis")(100), domain = c(0, 1))
plot_data$combined_color <- mapply(assign_combined_color,
plot_data$orderNorm_meanPhylop,
MoreArgs = list(
lower_quantile = lower_quantile,
upper_quantile = upper_quantile,
lower_color = "#002051",
upper_color = "#fdea45",
middle_color_scale = middle_color_scale
))
Nodecat_on_meanphylop_meplot_svm <- ggplot(plot_data, aes(x = Predicted_Class2, y = orderNorm_meanPhylop)) +
geom_jitter(aes(colour = combined_color), alpha = 0.3, shape = 16, size=4) +
scale_color_identity() + # Use the actual color values
stat_summary(fun.data = "mean_sdl", fun.args = list(mult = 1),
geom = "pointrange", color = "black", size = 1.75, alpha = 1.0) +
theme_minimal() +
labs(title = "All data oN mean PhyloP", x = "Node category SVM", y = "oN mean PhyloP") +
geom_boxplot(outlier.shape = NA, alpha=0.01, size=1.25)
print(Nodecat_on_meanphylop_meplot_svm)
ggsave("Nodecat_on_meanphylop_meplot_svm_combined_color.pdf", plot = Nodecat_on_meanphylop_meplot_svm, width = 5, height = 10)
We see that the distribution of constraint values matches the hypothesis.
However, there may be some nodes that are classified, but don’t have strong support for either class. We have a closer look at the output of the SVM based classification through looking at the decision.values object. Here and elsewhere, we always count the rows of the dataset to keep track of what’s included. Loosely speaking, decision values of >+1 and <-1 (representing the hyperplane dividing the categories), are considered as well supported, but there is no exact cut-off point. Here we assume that values closer to zero than +0.7 and -0.7 should not be classified as H,I or P but instead as “Uncertain”. We go with the same procedure as the SVM and do a majority vote (if at least 2 out of the three decision vectors fall below the threshold the node is classified as uncertain).
# make a new test_set not to disturb the other one we just made
test_set2 <- dataset[is.na(dataset$DFA_Nodecat), ]
print("row count of test_set")
## [1] "row count of test_set"
print(nrow(test_set2))
## [1] 13397
svm_predictions2 <- predict(svm_model, newdata = test_set_for_prediction, decision.values = TRUE)
svm_predictions_vector <- as.vector(svm_predictions2)
decision_values <- attr(svm_predictions2, "decision.values")
print(str(decision_values))
## num [1:13397, 1:3] 1.64 -1.48 -3.99 -2.73 -3.57 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:13397] "1" "2" "3" "4" ...
## ..$ : chr [1:3] "H/I" "H/P" "I/P"
## NULL
if (is.matrix(decision_values) || is.data.frame(decision_values)) {
colnames(decision_values) <- gsub("/", "_", colnames(decision_values))
print(colnames(decision_values))
} else {
print("decision_values is not a matrix or data frame.")
}
## [1] "H_I" "H_P" "I_P"
decision_values <- as.data.frame(decision_values)
# Attach predictions and decision values back to the test_set
test_set2$Predicted_Class3 <- svm_predictions_vector
test_set2$decision_values_HI <- NA #housekeeping just in case
test_set2$decision_values_HP <- NA
test_set2$decision_values_IP <- NA
test_set2$decision_values_HI <- decision_values$H_I
test_set2$decision_values_HP <- decision_values$H_P
test_set2$decision_values_IP <- decision_values$I_P
test_set2$decision_values_HI <- as.numeric(test_set2$decision_values_HI)
test_set2$decision_values_HP <- as.numeric(test_set2$decision_values_HP)
test_set2$decision_values_IP <- as.numeric(test_set2$decision_values_IP)
# Perform the reclassification of weakly supported nodes as Uncertain.
test_set2$Predicted_Class4 <- test_set2$Predicted_Class3
confidence_threshold <- 0.7
for (i in 1:nrow(test_set2)) {
values <- c(test_set2[i, "decision_values_HI"],
test_set2[i, "decision_values_HP"],
test_set2[i, "decision_values_IP"])
values_below_threshold <- sum(abs(values) < confidence_threshold)
if (values_below_threshold >= 2) {
test_set2$Predicted_Class4[i] <- 'Uncertain'
}
}
head(test_set2$Predicted_Class4)
## [1] "H" "I" "I" "I" "I" "I"
print("row count of test set2 still missing the original training data")
## [1] "row count of test set2 still missing the original training data"
print(nrow(test_set2))
## [1] 13397
table(test_set2$Predicted_Class4)
##
## H I P Uncertain
## 136 9358 2856 1047
# Add training data back in and merge subsets
initial_classified_copy$Predicted_Class4 <- NA
initial_classified_copy$decision_values_HI <- NA
initial_classified_copy$decision_values_HP <- NA
initial_classified_copy$decision_values_IP <- NA
test_set2$DFA_Nodecat <- NULL
test_set2$DFA_Nodecat <- NA
test_set2$Predicted_Class4 <- as.factor(test_set2$Predicted_Class4)
common_cols <- intersect(names(test_set2), names(initial_classified_copy))
test_set2_common <- test_set2[, common_cols]
initial_classified_copy_common <- initial_classified_copy[, common_cols]
test_set3 <- rbind(test_set2_common, initial_classified_copy_common)
test_set3$Predicted_Class4[is.na(test_set3$Predicted_Class4)] <- test_set3$DFA_Nodecat[is.na(test_set3$Predicted_Class4)]
print("row count of test set2 after adding the original training data")
## [1] "row count of test set2 after adding the original training data"
print(nrow(test_set3))
## [1] 16957
table(test_set3$Predicted_Class4)
##
## H I P Uncertain
## 306 11052 4552 1047
Plot the decision values and the new distribution of mean Phylop after declassing the Uncertain nodes.
test_set3$Predicted_Class4 <- factor(test_set3$Predicted_Class4)
num_classes <- length(unique(test_set3$Predicted_Class4))
viridis_colors <- viridis::viridis(num_classes)
color_mapping <- setNames(viridis_colors, unique(test_set3$Predicted_Class4))
fig2 <- plot_ly(data = test_set3,
x = ~decision_values_HI,
y = ~decision_values_HP,
z = ~decision_values_IP,
type = 'scatter3d',
mode = 'markers',
marker = list(size = 5,
color = ~color_mapping[Predicted_Class4],
showscale = FALSE, opacity=0.5)) # Remove colorbar
fig2 <- fig2 %>% layout(title = "3D Scatter Plot of Decision Values",
scene = list(xaxis = list(title = 'Decision Value H/I'),
yaxis = list(title = 'Decision Value H/P'),
zaxis = list(title = 'Decision Value I/P')))
fig2
#just redoing the order Norm transformation
"orderNorm_meanPhylop" %in% names(test_set3)
## [1] FALSE
test_set3$orderNorm_meanPhylop <- NULL
"orderNorm_meanPhylop" %in% names(test_set3)
## [1] FALSE
ON_meanPhylop <- orderNorm(test_set3$meanPhylop)
orderNorm_meanPhylop <- predict(ON_meanPhylop)
test_set3 <- cbind(test_set3, orderNorm_meanPhylop)
"orderNorm_meanPhylop" %in% names(test_set3)
## [1] TRUE
plot_data2 <- test_set3
lower_quantile <- quantile(plot_data2$orderNorm_meanPhylop, probs = 0.05, na.rm = TRUE)
upper_quantile <- quantile(plot_data2$orderNorm_meanPhylop, probs = 0.95, na.rm = TRUE)
assign_combined_color <- function(value, lower_quantile, upper_quantile, lower_color, upper_color, middle_color_scale) {
if (is.na(value)) {
return(NA) # Keep NA if the value is NA
} else if (value <= lower_quantile) {
return(lower_color)
} else if (value >= upper_quantile) {
return(upper_color)
} else {
normalized_value <- (value - lower_quantile) / (upper_quantile - lower_quantile)
return(middle_color_scale(normalized_value))
}
}
middle_color_scale <- scales::col_numeric(palette = viridis_pal(option = "cividis")(100), domain = c(0, 1))
plot_data2$combined_color <- mapply(assign_combined_color,
plot_data2$orderNorm_meanPhylop,
MoreArgs = list(
lower_quantile = lower_quantile,
upper_quantile = upper_quantile,
lower_color = "#002051",
upper_color = "#fdea45",
middle_color_scale = middle_color_scale
))
Nodecat_on_meanphylop_meplot_uncertain_svm <- ggplot(plot_data2, aes(x = Predicted_Class4, y = orderNorm_meanPhylop)) +
geom_jitter(aes(colour = combined_color), alpha = 0.3, shape = 16, size=4) +
scale_color_identity() + # Use the actual color values
stat_summary(fun.data = "mean_sdl", fun.args = list(mult = 1),
geom = "pointrange", color = "black", size = 1.75, alpha = 1.0) +
theme_minimal() +
scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
labs(title = "Uncertain oN mean Phylop", x = "Node category SVM", y = "oN mean Phylop") +
geom_boxplot(outlier.shape = NA, alpha=0.01, size=1.25)
print(Nodecat_on_meanphylop_meplot_uncertain_svm)
ggsave("Nodecat_on_meanphylop_meplot_svm_uncertain.pdf", plot = Nodecat_on_meanphylop_meplot_uncertain_svm, width = 5, height = 10)
write.csv(test_set3, "mammals_stats_predicted_Classes.csv", row.names=FALSE)
remove the uncertain nodes from the following analysis
test_set4 <- test_set3[test_set3$Predicted_Class4 != "Uncertain", ]
test_set4$Predicted_Class4 <- droplevels(test_set4$Predicted_Class4)
print("row count of test set4 with uncertain node rows removed")
## [1] "row count of test set4 with uncertain node rows removed"
print(nrow(test_set4))
## [1] 15910
table(test_set4$Predicted_Class4)
##
## H I P
## 306 11052 4552
Perform ANOVA to test for effect of node class on orderNorm transformed mean Phylop
anova_result <- aov(orderNorm_meanPhylop ~ Predicted_Class4, data = test_set4)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Predicted_Class4 2 428 213.77 218.6 <2e-16 ***
## Residuals 14825 14500 0.98
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1082 observations deleted due to missingness
Node class significantly predicts oN meanPhylop
Perform post-hoc Emmeans test
anova_result2 <- aov(orderNorm_meanPhylop ~ Predicted_Class4, data = test_set4)
anova_posthoc_pairs <- summary(emmeans(anova_result2,
pairwise ~ Predicted_Class4, adjust="fdr")$contrast) %>%
as.data.frame()
anova_posthoc_pairs %>%
kbl(digits = 4,
caption = "Pairwise tests of oN_meanPhylop by node cateogry FDR adjustment") %>%
kable_classic(full_width = F, html_font = "Cambria")
| contrast | estimate | SE | df | t.ratio | p.value |
|---|---|---|---|---|---|
| H - I | 0.5052 | 0.0582 | 14825 | 8.6813 | 0 |
| H - P | 0.8364 | 0.0594 | 14825 | 14.0751 | 0 |
| I - P | 0.3312 | 0.0182 | 14825 | 18.1773 | 0 |
Since the p-values could be influenced by the large sample sizes, we add eta-squared for effect of the whole model, including the Evidence interpretations of eta-squared by Cohen 1992.
anova_result <- car::Anova(aov(orderNorm_meanPhylop ~ Predicted_Class4, data = test_set4), type = 2)
anova_result_df <- tidy(anova_result) %>%
as.data.frame()
# Table summary
anova_result_df <- sjstats::anova_stats(car::Anova(aov(orderNorm_meanPhylop ~ Predicted_Class4, data = test_set4), type = 2)) %>%
dplyr::select(-c(etasq, omegasq, epsilonsq, power, cohens.f))
# Effect size
anova_result_df$eta2.int=interpret_eta_squared(
anova_result_df$partial.etasq,
rules="cohen1992")
anova_result_df <- anova_result_df %>%
dplyr::rename("P"="p.value") %>%
mutate("Significance" = case_when(
P >= 0.05 ~ "",
P >= 0.01 ~ "*",
P >= 0.001 ~ "**",
P >= 0.0001 ~ "***",
P < 0.0001 ~ "****",
TRUE ~ NA
)) %>%
mutate("Evidence" = case_when(
P >= 0.1 ~ "Little or none",
P >= 0.05 ~ "Weak",
P >= 0.01 ~ "Moderate",
P >= 0.001 ~ "Strong",
P < 0.0001 ~ "Very strong",
TRUE ~ NA
))
anova_result_df %>%
kbl(digits = 4,
caption = "Analysis of variance for orderNorm meanPhylop - full data") %>%
kable_classic(full_width = F, html_font = "Cambria")
| term | sumsq | meansq | df | statistic | P | partial.etasq | partial.omegasq | eta2.int | Significance | Evidence | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Predicted_Class4 | Predicted_Class4 | 427.544 | 213.772 | 2 | 218.566 | 0 | 0.029 | 0.029 | small | **** | Very strong |
| …2 | Residuals | 14499.801 | 0.978 | 14825 | NA | NA | NA | NA | NA | NA | NA |
The effect is small but significant and evidence is very strong.
Since the p-values could be influenced by the large sample sizes, we add Cohens D for effect size of pairwise comparisons, and using the Evidence interpretations of Cohen’s D amd Eta squared as mentioned in Sawilovsky, 2009.
# Effect size
anova_posthoc_pairs4 <- anova_posthoc_pairs %>%
dplyr::rename("P"="p.value") %>%
mutate("Significance" = case_when(
P >= 0.05 ~ "",
P >= 0.01 ~ "*",
P >= 0.001 ~ "**",
P >= 0.0001 ~ "***",
P < 0.0001 ~ "****",
TRUE ~ NA
)) %>%
mutate("Evidence" = case_when(
P >= 0.1 ~ "Little or none",
P >= 0.05 ~ "Weak",
P >= 0.01 ~ "Moderate",
P >= 0.001 ~ "Strong",
P < 0.0001 ~ "Very strong",
TRUE ~ NA
))
anova_posthoc_pairs4$d <- t_to_d(t = anova_posthoc_pairs4$t.ratio,
df = 103)[1]$d
anova_posthoc_pairs4$r <- t_to_r(t = anova_posthoc_pairs4$t.ratio,
df = 103)[1]$r
anova_posthoc_pairs4$O2 <- t_to_omega2(t = anova_posthoc_pairs4$t.ratio,
df = 103)[1]$'Omega2 (partial)'
anova_posthoc_pairs4$eta2 <- t_to_eta2(t = anova_posthoc_pairs4$t.ratio,
df = 103)[1]$Eta2_partial
anova_posthoc_pairs4$d.int=interpret_cohens_d(anova_posthoc_pairs4$d,
rules="sawilowsky2009")
anova_posthoc_pairs4$eta2.int=interpret_eta_squared(anova_posthoc_pairs4$eta2,
rules="cohen1992")
anova_posthoc_pairs4 %>%
kbl(digits = 4,caption = "Pairwise effect sizes for Nodecat" ) %>%
kable_classic(full_width = F, html_font = "Cambria")
| contrast | estimate | SE | df | t.ratio | P | Significance | Evidence | d | r | eta2 | d.int | eta2.int |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| H - I | 0.5052 | 0.0582 | 14825 | 8.6813 | 0 | **** | Very strong | 1.7108 | 0.6500 | 0.4225 | very large | large |
| H - P | 0.8364 | 0.0594 | 14825 | 14.0751 | 0 | **** | Very strong | 2.7737 | 0.8111 | 0.6579 | huge | large |
| I - P | 0.3312 | 0.0182 | 14825 | 18.1773 | 0 | **** | Very strong | 3.5821 | 0.8731 | 0.7624 | huge | large |
The node categories all have significant pairwise differences in constraint with a very large to huge effect size.
Here we first remove the genes which aren’t relevant for Hibernation.
test_set4$Direction <- as.factor(test_set4$Direction)
table(test_set4$Direction)
##
## 1 slower 2 faster
## 12997 2896 8 9
rows_with_empty_level <- which(test_set4$Direction == "")
rm_Dir <- test_set4[-rows_with_empty_level, ]
levels(rm_Dir$Direction)
## [1] "" " " "1 slower" "2 faster"
rm_Dir$Direction[rm_Dir$Direction == ""] <- NA
rm_Dir$Direction[rm_Dir$Direction == " "] <- NA
rm_Dir$Direction <- droplevels(rm_Dir$Direction)
rm_Dir <- rm_Dir[!is.na(rm_Dir$Direction), ]
levels(rm_Dir$Direction)
## [1] "1 slower" "2 faster"
table(rm_Dir$Predicted_Class4)
##
## H I P
## 1 12 4
The majority of the hibernation genes are in the I node category as expected
Direction_oN_mean_phylop_box <- ggplot(rm_Dir, aes(x = Direction, y = orderNorm_meanPhylop)) +
theme_minimal() +
geom_jitter(aes(shape = Predicted_Class4, color = Predicted_Class4), alpha = 0.6, size = 6, width=0.2) +
scale_color_viridis_d() +
geom_boxplot(aes(fill = Direction), outlier.shape = NA, alpha = 0.3) +
labs(title = "All data oN mean Phylop", x = "Direction", y = "oN mean PhyloP") +
scale_fill_viridis_d() +
scale_shape_manual(values = c(16, 17, 15)) # Closed shapes
print(Direction_oN_mean_phylop_box)
ggsave("Direction_oN_mean_phylop_box.pdf", Direction_oN_mean_phylop_box, width = 5, height = 10)
Genes accelerating evolutionary rate in aligned with the emergence of hibernation are more constrained than genes lowering their evolutionary rate in aligned with the emergene of hibernation
Fit a linear model for the effect of Node category and direction of rate change rho on Hibernation-relevant genes, followed by post hoc tests and effect size estimation as described above.
anova_result <- car::Anova(aov(orderNorm_meanPhylop ~ Direction + Predicted_Class4, data = rm_Dir), type = 2)
anova_result_df <- tidy(anova_result) %>%
as.data.frame()
anova_result_df <- sjstats::anova_stats(car::Anova(aov(orderNorm_meanPhylop ~ Direction + Predicted_Class4, data = rm_Dir), type = 2)) %>%
dplyr::select(-c(etasq, omegasq, epsilonsq, power, cohens.f))
anova_result_df$eta2.int=interpret_eta_squared(
anova_result_df$partial.etasq,
rules="cohen1992")
anova_result_df <- anova_result_df %>%
dplyr::rename("P"="p.value") %>%
mutate("Significance" = case_when(
P >= 0.05 ~ "",
P >= 0.01 ~ "*",
P >= 0.001 ~ "**",
P >= 0.0001 ~ "***",
P < 0.0001 ~ "****",
TRUE ~ NA
)) %>%
mutate("Evidence" = case_when(
P >= 0.1 ~ "Little or none",
P >= 0.05 ~ "Weak",
P >= 0.01 ~ "Moderate",
P >= 0.001 ~ "Strong",
P < 0.0001 ~ "Very strong",
TRUE ~ NA
))
anova_result_df %>%
kbl(digits = 4,
caption = "Analysis of variance for orderNorm meanPhylop by Direction - full data") %>%
kable_classic(full_width = F, html_font = "Cambria")
| term | sumsq | meansq | df | statistic | P | partial.etasq | partial.omegasq | eta2.int | Significance | Evidence | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Direction | Direction | 5.163 | 5.163 | 1 | 6.229 | 0.027 | 0.324 | 0.235 | large |
|
Moderate |
| Predicted_Class4 | Predicted_Class4 | 1.545 | 0.773 | 2 | 0.932 | 0.419 | 0.125 | -0.008 | small | Little or none | |
| …3 | Residuals | 10.776 | 0.829 | 13 | NA | NA | NA | NA | NA | NA | NA |
Since the p-values could be influenced by the large sample sizes, we add Cohens D for effect size of pairwise comparisons, and using the Evidence interpretations of Cohen’s D amd Eta squared as mentioned in Sawilovsky, 2009.
# Effect size
anova_posthoc_pairs2 <- anova_posthoc_pairs %>%
dplyr::rename("P"="p.value") %>%
mutate("Significance" = case_when(
P >= 0.05 ~ "",
P >= 0.01 ~ "*",
P >= 0.001 ~ "**",
P >= 0.0001 ~ "***",
P < 0.0001 ~ "****",
TRUE ~ NA
)) %>%
mutate("Evidence" = case_when(
P >= 0.1 ~ "Little or none",
P >= 0.05 ~ "Weak",
P >= 0.01 ~ "Moderate",
P >= 0.001 ~ "Strong",
P < 0.0001 ~ "Very strong",
TRUE ~ NA
))
anova_posthoc_pairs2$d <- t_to_d(t = anova_posthoc_pairs2$t.ratio,
df = 103)[1]$d
anova_posthoc_pairs2$r <- t_to_r(t = anova_posthoc_pairs2$t.ratio,
df = 103)[1]$r
anova_posthoc_pairs2$O2 <- t_to_omega2(t = anova_posthoc_pairs2$t.ratio,
df = 103)[1]$'Omega2 (partial)'
anova_posthoc_pairs2$eta2 <- t_to_eta2(t = anova_posthoc_pairs2$t.ratio,
df = 103)[1]$Eta2_partial
anova_posthoc_pairs2$d.int=interpret_cohens_d(anova_posthoc_pairs2$d,
rules="sawilowsky2009")
anova_posthoc_pairs2$eta2.int=interpret_eta_squared(anova_posthoc_pairs2$eta2,
rules="cohen1992")
anova_posthoc_pairs2 %>%
kbl(digits = 4,caption = "Pairwise effect sizes for Direction" ) %>%
kable_classic(full_width = F, html_font = "Cambria")
| contrast | estimate | SE | df | t.ratio | P | Significance | Evidence | d | r | eta2 | d.int | eta2.int |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| H - I | 0.5052 | 0.0582 | 14825 | 8.6813 | 0 | **** | Very strong | 1.7108 | 0.6500 | 0.4225 | very large | large |
| H - P | 0.8364 | 0.0594 | 14825 | 14.0751 | 0 | **** | Very strong | 2.7737 | 0.8111 | 0.6579 | huge | large |
| I - P | 0.3312 | 0.0182 | 14825 | 18.1773 | 0 | **** | Very strong | 3.5821 | 0.8731 | 0.7624 | huge | large |
We can compare the distribution of ASPL, NC, BC, and DFA_nodecat of 1000 iterations of 18 randomly picked genes, and the actual values of these genes. Differences in distributions are assessed with Kolmogorov-Smirnov tests.
set.seed(123)
table(test_set3$Direction)
##
## 1 slower 2 faster
## 13850 3089 8 10
filtered_data3 <- test_set3 %>%
filter(Direction %in% c("1 slower", "2 faster"))
num_samples <- 1000
sample_size <- 18
sample_means <- replicate(num_samples, {
sample_rows <- sample(1:nrow(test_set3), size = sample_size)
sampled_data <- test_set3$AverageShortestPathLength[sample_rows]
mean(sampled_data)
}, simplify = "data.frame")
sample_means_df <- data.frame(AverageShortestPathLength = sample_means)
ks_test_1slower_vs_sample_means <- ks.test(filtered_data3$AverageShortestPathLength[filtered_data3$Direction == "1 slower"], sample_means_df$AverageShortestPathLength)
ks_test_2faster_vs_sample_means <- ks.test(filtered_data3$AverageShortestPathLength[filtered_data3$Direction == "2 faster"],sample_means_df$AverageShortestPathLength)
ks_test_hib_vs_sample_means <-ks.test(filtered_data3$AverageShortestPathLength, sample_means_df$AverageShortestPathLength)
ks_test_results <- data.frame(
Test = c("1 slower vs. simulated means", "2 faster vs. simulated means", "hibernationgenes vs. simulated means"),
Statistic = c(ks_test_1slower_vs_sample_means$statistic,
ks_test_2faster_vs_sample_means$statistic,
ks_test_hib_vs_sample_means$statistic),
P_Value = c(ks_test_1slower_vs_sample_means$p.value,
ks_test_2faster_vs_sample_means$p.value,
ks_test_hib_vs_sample_means$p.value)
)
kable(ks_test_results, caption = "Kolmogorov-Smirnov Test Results for ASPL")
| Test | Statistic | P_Value |
|---|---|---|
| 1 slower vs. simulated means | 0.3740000 | 0.1675508 |
| 2 faster vs. simulated means | 0.3000000 | 0.3349494 |
| hibernationgenes vs. simulated means | 0.3323333 | 0.0402521 |
set.seed(123)
num_samples <- 1000
sample_size <- 18
sample_means <- replicate(num_samples, {
sample_rows <- sample(1:nrow(test_set3), size = sample_size)
sampled_data <- test_set3$BetweennessCentrality[sample_rows]
mean(sampled_data)
}, simplify = "data.frame")
sample_means_df <- data.frame(BetweennessCentrality = sample_means)
ks_test_1slower_vs_sample_means <- ks.test(filtered_data3$BetweennessCentrality[filtered_data3$Direction == "1 slower"],
sample_means_df$BetweennessCentrality)
ks_test_2faster_vs_sample_means <- ks.test(filtered_data3$BetweennessCentrality[filtered_data3$Direction == "2 faster"],
sample_means_df$BetweennessCentrality)
ks_test_hib_vs_sample_means <- ks.test(filtered_data3$BetweennessCentrality,
sample_means_df$BetweennessCentrality)
ks_test_results <- data.frame(
Test = c("1 slower vs. simulated means", "2 faster vs. simulated means", "hibernationgenes vs. simulated means"),
Statistic = c(ks_test_1slower_vs_sample_means$statistic,
ks_test_2faster_vs_sample_means$statistic,
ks_test_hib_vs_sample_means$statistic),
P_Value = c(ks_test_1slower_vs_sample_means$p.value,
ks_test_2faster_vs_sample_means$p.value,
ks_test_hib_vs_sample_means$p.value)
)
kable(ks_test_results, caption = "Kolmogorov-Smirnov Test Results for BC")
| Test | Statistic | P_Value |
|---|---|---|
| 1 slower vs. simulated means | 0.5000000 | 0.0235732 |
| 2 faster vs. simulated means | 0.5010000 | 0.0138819 |
| hibernationgenes vs. simulated means | 0.4565556 | 0.0012581 |
set.seed(123)
sample_means <- replicate(num_samples, {
sample_rows <- sample(1:nrow(test_set3), size = sample_size)
sampled_data <- test_set3$NeighborhoodConnectivity[sample_rows]
mean(sampled_data)
}, simplify = "data.frame")
sample_means_df <- data.frame(NeighborhoodConnectivity = sample_means)
ks_test_1slower_vs_sample_means <- ks.test(filtered_data3$NeighborhoodConnectivity[filtered_data3$Direction == "1 slower"],
sample_means_df$NeighborhoodConnectivity)
ks_test_2faster_vs_sample_means <- ks.test(filtered_data3$NeighborhoodConnectivity[filtered_data3$Direction == "2 faster"],
sample_means_df$NeighborhoodConnectivity)
ks_test_hib_vs_sample_means <- ks.test(filtered_data3$NeighborhoodConnectivity,
sample_means_df$NeighborhoodConnectivity)
ks_test_results <- data.frame(
Test = c("1 slower vs. simulated means", "2 faster vs. simulated means", "hibernationgenes vs. simulated means"),
Statistic = c(ks_test_1slower_vs_sample_means$statistic,
ks_test_2faster_vs_sample_means$statistic,
ks_test_hib_vs_sample_means$statistic),
P_Value = c(ks_test_1slower_vs_sample_means$p.value,
ks_test_2faster_vs_sample_means$p.value,
ks_test_hib_vs_sample_means$p.value)
)
kable(ks_test_results, caption = "Kolmogorov-Smirnov Test Results for NC")
| Test | Statistic | P_Value |
|---|---|---|
| 1 slower vs. simulated means | 0.382 | 0.1508962 |
| 2 faster vs. simulated means | 0.654 | 0.0004195 |
| hibernationgenes vs. simulated means | 0.499 | 0.0002998 |
set.seed(123)
sample_means <- replicate(num_samples, {
sample_rows <- sample(1:nrow(test_set3), size = sample_size)
sampled_data <- test_set3$meanPhylop[sample_rows]
mean(sampled_data)
}, simplify = "data.frame")
sample_means_df <- data.frame(meanPhylop = sample_means)
ks_test_1slower_vs_sample_means <- ks.test(filtered_data3$meanPhylop[filtered_data3$Direction == "1 slower"],
sample_means_df$meanPhylop)
ks_test_2faster_vs_sample_means <- ks.test(filtered_data3$meanPhylop[filtered_data3$Direction == "2 faster"],
sample_means_df$meanPhylop)
ks_test_hib_vs_sample_means <- ks.test(filtered_data3$meanPhylop,
sample_means_df$meanPhylop)
ks_test_results <- data.frame(
Test = c("1 slower vs. simulated means", "2 faster vs. simulated means", "hibernationgenes vs. simulated means"),
Statistic = c(ks_test_1slower_vs_sample_means$statistic,
ks_test_2faster_vs_sample_means$statistic,
ks_test_hib_vs_sample_means$statistic),
P_Value = c(ks_test_1slower_vs_sample_means$p.value,
ks_test_2faster_vs_sample_means$p.value,
ks_test_hib_vs_sample_means$p.value)
)
kable(ks_test_results, caption = "Kolmogorov-Smirnov Test Results for meanPhylop")
| Test | Statistic | P_Value |
|---|---|---|
| 1 slower vs. simulated means | 0.3636364 | 0.2028744 |
| 2 faster vs. simulated means | 0.7848485 | 0.0000009 |
| hibernationgenes vs. simulated means | 0.5808081 | 0.0000069 |